{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-processing WADI dataset\n",
    "\n",
    "Following implementation of [Deng & Hooi (2021)](https://bhooi.github.io/papers/gdn_aaai2021.pdf): (1) MinMax Normalization of Data and (2) Down Sampling of Data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import packages and dataset\n",
    "\n",
    "After downloading the dataset from [iTrust](https://itrust.sutd.edu.sg/), we use the dataset - <u>WADI.A1_9 Oct 2017</u>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "from __future__ import unicode_literals\n",
    "\n",
    "import pandas as pd\n",
    "from datetime import datetime, timedelta\n",
    "from dateutil import parser\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = \"wadi2017/WADI_14days.csv\" ## Training dataset\n",
    "test_path = \"wadi2017/WADI_attackdata.csv\"\n",
    "output_dir = \"../data/wadi\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Process Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Process Training Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv(train_path,header=3)\n",
    "train = train.fillna(0)\n",
    "train = train.drop(columns=['Row'])\n",
    "train['Date'] = train['Date'].apply(lambda x: '/'.join([i.zfill(2) for i in x.split('/')]))\n",
    "train['Time'] = train['Time'].apply(lambda x: x.replace('.000','')) \n",
    "train['Time'] = train['Time'].apply(lambda x: ':'.join([i.zfill(2) for i in x.split(':')]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['datetime'] = train.apply(lambda x: datetime.strptime(x.Date +' '+x.Time,'%m/%d/%Y %I:%M:%S %p'),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert train['datetime'].tolist() == train.apply(lambda x: parser.parse(x.Date +' '+x.Time,fuzzy=True),\n",
    "                                                 axis=1).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename and resort\n",
    "coi = ['datetime'] + [i for i in train.columns if i not in ['Date','Time','datetime']]\n",
    "train = train[coi]\n",
    "train = train.sort_values('datetime')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "empty_cols = [col for col in train.columns if train[col].isnull().all()]\n",
    "train[empty_cols] = train[empty_cols].fillna(0,inplace=True)\n",
    "\n",
    "for i in train.columns[train.isnull().any(axis=0)]:     #---Applying Only on variables with NaN values\n",
    "    train[i].fillna(train[i].mean(),inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__MINMAX normalization__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "scaler = MinMaxScaler()\n",
    "X_train = train.iloc[:,1:].to_numpy()\n",
    "scaler.fit(X_train)\n",
    "normalized_train_X = scaler.transform(X_train)\n",
    "\n",
    "normalized_train_X\n",
    "\n",
    "normalized_train = train.copy()\n",
    "normalized_train.iloc[:,1:] = normalized_train_X "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Downsample data__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_train = normalized_train.groupby(np.arange(len(train))//10).median()\n",
    "time = [normalized_train.iloc[0,0] + timedelta(seconds=10*i) for i in range(0,len(filtered_train))]\n",
    "\n",
    "filtered_train['datetime'] = time\n",
    "filtered_train = filtered_train.iloc[2160:]\n",
    "filtered_train = filtered_train.iloc[:-1,:]\n",
    "filtered_train = filtered_train[coi] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename\n",
    "import re \n",
    "pat = re.escape('\\\\\\\\WIN-25J4RO10SBF\\\\LOG_DATA\\\\SUTD_WADI\\\\LOG_DATA\\\\')\n",
    "\n",
    "rename_col = [re.sub(pat,'',i) for i in filtered_train.columns]\n",
    "filtered_train.columns = rename_col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_train = filtered_train.reset_index().drop(columns=['datetime','index'])\n",
    "\n",
    "final_train = final_train.reset_index().rename(columns={'index':'timestamp'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Process Test Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Inject anomaly labels based on the pdf table__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "attacks = []\n",
    "\n",
    "# Attack 1 \n",
    "start = datetime(2017,10,9,19,25,00)\n",
    "end = datetime(2017,10,9,19,50,16)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "print(attacks[0],attacks[-1])\n",
    "# Attack 2\n",
    "start = datetime(2017,10,10,10,24,10)\n",
    "end = datetime(2017,10,10,10,34,00)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "# Attack 3-4\n",
    "start = datetime(2017,10,10,10,55,00)\n",
    "end = datetime(2017,10,10,11,24,00)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "#attacks['1_AIT_001'] = [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds()))]\n",
    "\n",
    "# Attack 5\n",
    "start = datetime(2017,10,10,11,30,40)\n",
    "end = datetime(2017,10,10,11,44,50)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "# Attack 6\n",
    "start = datetime(2017,10,10,13,39,30)\n",
    "end = datetime(2017,10,10,13,50,40)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "#attacks['2_MCV_101'] = [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds()))]\n",
    "\n",
    "# Attack 7\n",
    "start = datetime(2017,10,10,14,48,17)\n",
    "end = datetime(2017,10,10,14,59,55)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "\n",
    "# Attack 8\n",
    "start = datetime(2017,10,10,17,40,00)\n",
    "end = datetime(2017,10,10,17,49,40)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "\n",
    "# Attack 9\n",
    "start = datetime(2017,10,10,10,55,00)\n",
    "end = datetime(2017,10,10,10,56,27)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "# Attack 10\n",
    "start = datetime(2017,10,11,11,17,54)\n",
    "end = datetime(2017,10,11,11,31,20)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "\n",
    "# Attack 11\n",
    "start = datetime(2017,10,11,11,36,31)\n",
    "end = datetime(2017,10,11,11,47,00)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "\n",
    "# Attack 12\n",
    "start = datetime(2017,10,11,11,59,00)\n",
    "end = datetime(2017,10,11,12,5,00)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "\n",
    "# Attack 13\n",
    "start = datetime(2017,10,11,12,7,30)\n",
    "end = datetime(2017,10,11,12,10,52)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "# Attack 14\n",
    "start = datetime(2017,10,11,12,16,00)\n",
    "end = datetime(2017,10,11,12,25,36)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "# Attack 15\n",
    "start = datetime(2017,10,11,15,26,30)\n",
    "end = datetime(2017,10,11,15,37,00)\n",
    "\n",
    "attacks += [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "tt = [start + timedelta(seconds=1*i) for i in range(int((end-start).total_seconds())+1)]\n",
    "\n",
    "print(tt[0],tt[-1])\n",
    "\n",
    "attacks_set = set(attacks)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Import Test Data__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = pd.read_csv(test_path)\n",
    "#test = test.fillna(0)\n",
    "test = test.drop(columns=['Row'])\n",
    "test['Date'] = test['Date'].apply(lambda x: '/'.join([i.zfill(2) for i in x.split('/')]))\n",
    "test['Time'] = test['Time'].apply(lambda x: x.replace('.000','')) \n",
    "test['Time'] = test['Time'].apply(lambda x: ':'.join([i.zfill(2) for i in x.split(':')]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test['datetime'] = test.apply(lambda x: datetime.strptime(x.Date +' '+x.Time,'%m/%d/%Y %I:%M:%S %p'),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "empty_cols = [col for col in test.columns if test[col].isnull().all()]\n",
    "test[empty_cols] = test[empty_cols].fillna(0)\n",
    "\n",
    "for i in test.columns[test.isnull().any(axis=0)]:     #---Applying Only on variables with NaN values\n",
    "    test[i].fillna(test[i].mean(),inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def attacked(datetime,datetime_list):\n",
    "    if datetime in datetime_list:\n",
    "        return int(1)\n",
    "    else:\n",
    "        return int(0)\n",
    "\n",
    "test['attack'] = test['datetime'].apply(lambda x: attacked(x,attacks_set))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename and resort\n",
    "coi = ['datetime'] + [i for i in train.columns if i not in ['Date','Time','datetime']]\n",
    "coi += ['attack']\n",
    "test = test[coi]\n",
    "test = test.sort_values('datetime')\n",
    "\n",
    "#test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Minmax Normalization__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## NORMALIZATION\n",
    "X_test = test.iloc[:,1:-1].to_numpy()\n",
    "normalized_test_X = scaler.transform(X_test)\n",
    "\n",
    "normalized_test = test.copy()\n",
    "normalized_test.iloc[:,1:-1] = normalized_test_X "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Downsample__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_test = normalized_test.iloc[:,1:].groupby(np.arange(len(test.iloc[:,1:]))//10).median()\n",
    "max_ftest = normalized_test.iloc[:,1:].groupby(np.arange(len(test.iloc[:,1:]))//10).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_test = filtered_test.iloc[:-1,:].copy()\n",
    "final_test['attack'] = final_test['attack'].round()\n",
    "final_test['datetime'] = [test['datetime'][0] + timedelta(seconds=10*i) for i in range(0,len(final_test))]\n",
    "\n",
    "## Renaming\n",
    "import re \n",
    "pat = re.escape('\\\\\\\\WIN-25J4RO10SBF\\\\LOG_DATA\\\\SUTD_WADI\\\\LOG_DATA\\\\')\n",
    "\n",
    "rename_col = [re.sub(pat,'',i) for i in final_test.columns]\n",
    "final_test.columns = rename_col\n",
    "\n",
    "## Sort columns\n",
    "sort_col = ['datetime'] + [i for i in final_test if i != 'datetime']\n",
    "final_test = final_test[sort_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_test = final_test.reset_index().drop(columns=['datetime','index'])\n",
    "\n",
    "final_test = final_test.reset_index().rename(columns={'index':'timestamp'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate Output Files "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Import function for data generation__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def generate_graph_seq2seq_io_data(\n",
    "        df, x_offsets, y_offsets, scaler=None):\n",
    "    \"\"\"\n",
    "    Generate samples from\n",
    "    :param df:\n",
    "    :param x_offsets:\n",
    "    :param y_offsets:\n",
    "    :param add_time_in_day:\n",
    "    :param add_day_in_week:\n",
    "    :param scaler:\n",
    "    :return:\n",
    "    # x: (epoch_size, input_length, num_nodes, input_dim)\n",
    "    # y: (epoch_size, output_length, num_nodes, output_dim)\n",
    "    \"\"\"\n",
    "    ## FOR WADI and SWAT DATASET\n",
    "    #df = df.set_index('datetime')\n",
    "\n",
    "    if 'attack' in df.columns:\n",
    "        df = df.drop(columns=['attack'])\n",
    "    if 'timestamp' in df.columns:\n",
    "        df = df.drop(columns=['timestamp'])\n",
    "    \n",
    "    num_samples, num_nodes = df.shape\n",
    "    data = np.expand_dims(df.values, axis=-1)\n",
    "    data_list = [data]\n",
    "    data = np.concatenate(data_list, axis=-1)\n",
    "    # epoch_len = num_samples + min(x_offsets) - max(y_offsets)\n",
    "    x, y = [], []\n",
    "    # t is the index of the last observation.\n",
    "    min_t = abs(min(x_offsets))\n",
    "    max_t = abs(num_samples - abs(max(y_offsets)))  # Exclusive\n",
    "    for t in range(min_t, max_t):\n",
    "        x_t = data[t + x_offsets, ...]\n",
    "        y_t = data[t + y_offsets, ...]\n",
    "        x.append(x_t)\n",
    "        y.append(y_t)\n",
    "    x = np.stack(x, axis=0)\n",
    "    y = np.stack(y, axis=0)\n",
    "    return x, y\n",
    "\n",
    "\n",
    "def generate_train_val_test(df,test_size,val_ratio,window_size,output_dir):\n",
    "    df = df.reset_index(drop=True)\n",
    "    x_offsets = np.sort(\n",
    "        np.concatenate((np.arange(-(window_size-1), 1, 1),))\n",
    "    )\n",
    "    # Predict the next one hour\n",
    "    y_offsets = np.sort(np.arange(1, 2, 1))\n",
    "    # x: (num_samples, input_length, num_nodes, input_dim)\n",
    "    # y: (num_samples, output_length, num_nodes, output_dim)\n",
    "    x, y = generate_graph_seq2seq_io_data(\n",
    "        df,\n",
    "        x_offsets=x_offsets,\n",
    "        y_offsets=y_offsets,\n",
    "    )\n",
    "\n",
    "    print(\"x shape: \", x.shape, \", y shape: \", y.shape)\n",
    "    # Write the data into npz file.\n",
    "    # num_test = 6831, using the last 6831 examples as testing.\n",
    "    # for the rest: 7/8 is used for training, and 1/8 is used for validation.\n",
    "    num_samples = x.shape[0]\n",
    "    num_test = test_size\n",
    "    train_samples = num_samples - num_test\n",
    "    num_train = round(train_samples * (1-val_ratio))\n",
    "    num_val = train_samples - num_train\n",
    "\n",
    "    # train\n",
    "    x_train, y_train = x[:num_train], y[:num_train]\n",
    "    # val\n",
    "    x_val, y_val = (\n",
    "        x[num_train: num_train + num_val],\n",
    "        y[num_train: num_train + num_val],\n",
    "    )\n",
    "    # test\n",
    "    x_test, y_test = (\n",
    "        x[-num_test: ],\n",
    "        y[-num_test: ],\n",
    "    )\n",
    "    for cat in [\"train\", \"val\",\"test\"]:\n",
    "        _x, _y = locals()[\"x_\" + cat], locals()[\"y_\" + cat]\n",
    "        print(cat, \"x: \", _x.shape, \"y:\", _y.shape)\n",
    "        np.savez_compressed(\n",
    "            os.path.join(output_dir, \"%s.npz\" % cat),\n",
    "            x=_x,\n",
    "            y=_y,\n",
    "            x_offsets=x_offsets.reshape(list(x_offsets.shape) + [1]),\n",
    "            y_offsets=y_offsets.reshape(list(y_offsets.shape) + [1]),\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "__Data Generate__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters for data generation\n",
    "val_ratio = 0.1\n",
    "window_size = 5\n",
    "# Output dir \n",
    "output_dir = '../data/wadi'\n",
    "if 'wadi' not in output_dir:\n",
    "    os.mkdir('../data/wadi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = final_test.attack.tolist()\n",
    "## Generate anomaly labels\n",
    "with open(output_dir+'/anomaly_labels.txt','w') as f:\n",
    "    f.write(','.join([str(i) for i in labels]))\n",
    "    \n",
    "final_test = final_test.drop(columns=['attack'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = final_train.append([final_test]).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Generate final dataset\n",
    "generate_train_val_test(data,len(final_test),val_ratio,window_size,output_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
